{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2.1.0\n",
      "sys.version_info(major=3, minor=6, micro=9, releaselevel='final', serial=0)\n",
      "matplotlib 3.1.3\n",
      "numpy 1.18.1\n",
      "pandas 1.0.1\n",
      "sklearn 0.22.1\n",
      "tensorflow 2.1.0\n",
      "tensorflow_core.python.keras.api._v2.keras 2.2.4-tf\n"
     ]
    }
   ],
   "source": [
    "import matplotlib as mpl\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline\n",
    "import numpy as np\n",
    "import sklearn\n",
    "import pandas as pd\n",
    "import os\n",
    "import sys\n",
    "import time\n",
    "import tensorflow as tf\n",
    "\n",
    "from tensorflow import keras\n",
    "\n",
    "print(tf.__version__)\n",
    "print(sys.version_info)\n",
    "for module in mpl, np, pd, sklearn, tf, keras:\n",
    "    print(module.__name__, module.__version__)\n",
    "\n",
    "'''\n",
    "从基础数据中创建dataset  \n",
    "tf.data.Dataset.from_tensor_slices()\n",
    "'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "ename": "ValueError",
     "evalue": "too many values to unpack (expected 2)",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mValueError\u001b[0m                                Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-5-e40ecb5b427a>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      2\u001b[0m \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'cat'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'dog'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'fox'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      3\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mb\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      5\u001b[0m     \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mValueError\u001b[0m: too many values to unpack (expected 2)"
     ]
    }
   ],
   "source": [
    "# x = np.array([[1, 2], [3, 4], [5, 6]])\n",
    "# y = np.array(['cat', 'dog', 'fox'])\n",
    "# data=(x,y)\n",
    "# for a,b in data:\n",
    "#     print(a,b)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<TensorSliceDataset shapes: (), types: tf.int64>\n"
     ]
    }
   ],
   "source": [
    "## 创建dataset，从numpy数组中创建dataset\n",
    "## 每个dataset创建时执行了两步：1. 把numpy书中中的每个item变成tensor张量格式。2. 将numpy变成dataset\n",
    "dataset = tf.data.Dataset.from_tensor_slices(np.arange(10))\n",
    "print(dataset)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tf.Tensor(0, shape=(), dtype=int64)\n",
      "tf.Tensor(1, shape=(), dtype=int64)\n",
      "tf.Tensor(2, shape=(), dtype=int64)\n",
      "tf.Tensor(3, shape=(), dtype=int64)\n",
      "tf.Tensor(4, shape=(), dtype=int64)\n",
      "tf.Tensor(5, shape=(), dtype=int64)\n",
      "tf.Tensor(6, shape=(), dtype=int64)\n",
      "tf.Tensor(7, shape=(), dtype=int64)\n",
      "tf.Tensor(8, shape=(), dtype=int64)\n",
      "tf.Tensor(9, shape=(), dtype=int64)\n"
     ]
    }
   ],
   "source": [
    "## 可以将dataset看作一个迭代器，可以使用循环查看\n",
    "## 其实其内部就是一个个的item，类似于数组\n",
    "for item in dataset:\n",
    "    print(item)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tf.Tensor([0 1 2 3 4 5 6], shape=(7,), dtype=int64)\n",
      "tf.Tensor([7 8 9 0 1 2 3], shape=(7,), dtype=int64)\n",
      "tf.Tensor([4 5 6 7 8 9 0], shape=(7,), dtype=int64)\n",
      "tf.Tensor([1 2 3 4 5 6 7], shape=(7,), dtype=int64)\n",
      "tf.Tensor([8 9], shape=(2,), dtype=int64)\n"
     ]
    }
   ],
   "source": [
    "# 1. repeat epoch\n",
    "# 2. get batch\n",
    "\n",
    "## dataset api\n",
    "## repeat，将dataset中的item循环3次\n",
    "## batch表示将数据 按照七个一组，组装起来，可以理解为 维度增加了，从向量变成了矩阵\n",
    "dataset = dataset.repeat(3).batch(7)\n",
    "for item in dataset:\n",
    "    print(item)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tf.Tensor(0, shape=(), dtype=int64)\n",
      "tf.Tensor(1, shape=(), dtype=int64)\n",
      "tf.Tensor(2, shape=(), dtype=int64)\n",
      "tf.Tensor(3, shape=(), dtype=int64)\n",
      "tf.Tensor(4, shape=(), dtype=int64)\n",
      "tf.Tensor(7, shape=(), dtype=int64)\n",
      "tf.Tensor(8, shape=(), dtype=int64)\n",
      "tf.Tensor(9, shape=(), dtype=int64)\n",
      "tf.Tensor(0, shape=(), dtype=int64)\n",
      "tf.Tensor(1, shape=(), dtype=int64)\n",
      "tf.Tensor(4, shape=(), dtype=int64)\n",
      "tf.Tensor(5, shape=(), dtype=int64)\n",
      "tf.Tensor(6, shape=(), dtype=int64)\n",
      "tf.Tensor(7, shape=(), dtype=int64)\n",
      "tf.Tensor(8, shape=(), dtype=int64)\n",
      "tf.Tensor(1, shape=(), dtype=int64)\n",
      "tf.Tensor(2, shape=(), dtype=int64)\n",
      "tf.Tensor(3, shape=(), dtype=int64)\n",
      "tf.Tensor(4, shape=(), dtype=int64)\n",
      "tf.Tensor(5, shape=(), dtype=int64)\n",
      "tf.Tensor(8, shape=(), dtype=int64)\n",
      "tf.Tensor(9, shape=(), dtype=int64)\n",
      "tf.Tensor(5, shape=(), dtype=int64)\n",
      "tf.Tensor(6, shape=(), dtype=int64)\n",
      "tf.Tensor(2, shape=(), dtype=int64)\n",
      "tf.Tensor(3, shape=(), dtype=int64)\n",
      "tf.Tensor(9, shape=(), dtype=int64)\n",
      "tf.Tensor(0, shape=(), dtype=int64)\n",
      "tf.Tensor(6, shape=(), dtype=int64)\n",
      "tf.Tensor(7, shape=(), dtype=int64)\n"
     ]
    }
   ],
   "source": [
    "# interleave: \n",
    "# case: 文件dataset -> 具体数据集\n",
    "\n",
    "## interleave，循环遍历dataset中的每个item，对其中的item进行处理（处理的结果也是个dastaset）\n",
    "## 最后将所有的dataset按照block_length的形式组装起来 --达到了均匀混合的效果--\n",
    "## \n",
    "dataset2 = dataset.interleave(\n",
    "    lambda v: tf.data.Dataset.from_tensor_slices(v), # map_fn\n",
    "    ## 并行处理\n",
    "    cycle_length = 5, # cycle_length\n",
    "    block_length = 5, # block_length\n",
    ")\n",
    "for item in dataset2:\n",
    "    print(item)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<TensorSliceDataset shapes: ((2,), ()), types: (tf.int64, tf.string)>\n",
      "[1 2] b'cat'\n",
      "[3 4] b'dog'\n",
      "[5 6] b'fox'\n"
     ]
    }
   ],
   "source": [
    "x = np.array([[1, 2], [3, 4], [5, 6]])\n",
    "y = np.array(['cat', 'dog', 'fox'])\n",
    "## dataset的数据来源是 元组\n",
    "## 在内部会进行拼接操作，[(a.item1,b.item1),(a.item2,b.item2)]\n",
    "dataset3 = tf.data.Dataset.from_tensor_slices((x, y))\n",
    "print(dataset3)\n",
    "\n",
    "for item_x, item_y in dataset3:\n",
    "    \n",
    "    print(item_x.numpy(), item_y.numpy())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1 2] b'cat'\n",
      "[3 4] b'dog'\n",
      "[5 6] b'fox'\n"
     ]
    }
   ],
   "source": [
    "## 从字典创建dataset\n",
    "dataset4 = tf.data.Dataset.from_tensor_slices({\"feature\": x,\n",
    "                                               \"label\": y})\n",
    "for item in dataset4:\n",
    "    print(item[\"feature\"].numpy(), item[\"label\"].numpy())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
